Get to know the data

In [1]:
with open('conversion_data.csv') as f:
    for num, line in enumerate(f):
        if num > 5:
            break
        print (num, line)
0 "country","age","new_user","source","total_pages_visited","converted"

1 "UK",25,1,"Ads",1,0

2 "US",23,1,"Seo",5,0

3 "US",28,1,"Seo",4,0

4 "China",39,1,"Seo",5,0

5 "US",30,1,"Seo",6,0

In [2]:
import pandas as pd

df = pd.read_csv('conversion_data.csv')
df.describe(include='all')
Out[2]:
country age new_user source total_pages_visited converted
count 316200 316200.000000 316200.000000 316200 316200.000000 316200.000000
unique 4 NaN NaN 3 NaN NaN
top US NaN NaN Seo NaN NaN
freq 178092 NaN NaN 155040 NaN NaN
mean NaN 30.569858 0.685465 NaN 4.872966 0.032258
std NaN 8.271802 0.464331 NaN 3.341104 0.176685
min NaN 17.000000 0.000000 NaN 1.000000 0.000000
25% NaN 24.000000 0.000000 NaN 2.000000 0.000000
50% NaN 30.000000 1.000000 NaN 4.000000 0.000000
75% NaN 36.000000 1.000000 NaN 7.000000 0.000000
max NaN 123.000000 1.000000 NaN 29.000000 1.000000
In [3]:
df.head()
Out[3]:
country age new_user source total_pages_visited converted
0 UK 25 1 Ads 1 0
1 US 23 1 Seo 5 0
2 US 28 1 Seo 4 0
3 China 39 1 Seo 5 0
4 US 30 1 Seo 6 0
In [4]:
df[df.age>70]
Out[4]:
country age new_user source total_pages_visited converted
90928 Germany 123 0 Seo 15 1
154217 US 73 1 Seo 5 0
192644 US 77 0 Direct 4 0
208969 US 72 1 Direct 4 0
265167 US 79 1 Direct 1 0
295581 UK 111 0 Ads 10 1
In [5]:
df[df.age<20]
Out[5]:
country age new_user source total_pages_visited converted
28 Germany 17 0 Seo 1 0
39 UK 18 1 Ads 14 0
40 US 17 0 Seo 1 0
47 China 19 1 Direct 2 0
53 US 17 1 Seo 7 0
70 US 19 1 Ads 3 0
82 US 18 0 Seo 6 0
87 US 18 0 Direct 19 1
92 Germany 19 1 Direct 5 0
101 China 18 1 Ads 3 0
111 US 19 1 Ads 7 0
130 US 17 0 Ads 19 1
148 US 19 1 Ads 3 0
155 US 18 1 Direct 2 0
164 UK 19 1 Ads 1 0
182 US 19 0 Ads 9 1
186 China 17 1 Seo 4 0
192 US 17 0 Seo 4 0
197 UK 17 1 Seo 5 0
204 US 18 1 Seo 7 0
213 US 17 0 Seo 21 1
244 China 18 0 Direct 6 0
261 China 19 1 Seo 5 0
266 UK 19 0 Ads 15 1
276 UK 17 0 Ads 2 0
283 UK 18 0 Direct 6 0
285 US 19 1 Ads 5 0
307 US 17 1 Seo 5 0
310 UK 19 0 Ads 3 0
323 China 18 1 Seo 6 0
... ... ... ... ... ... ...
315834 UK 18 0 Seo 6 0
315847 China 19 1 Ads 4 0
315855 US 17 0 Seo 5 0
315888 US 19 0 Seo 1 0
315901 China 19 0 Direct 6 0
315944 US 18 1 Seo 4 0
315950 US 19 1 Seo 5 0
315956 UK 19 1 Seo 6 0
315966 US 18 1 Seo 2 0
315970 US 17 1 Seo 8 0
315981 US 19 1 Ads 2 0
315983 US 19 0 Ads 11 0
315989 US 19 0 Seo 4 0
315993 US 18 1 Ads 6 0
316007 China 17 1 Direct 1 0
316009 China 17 1 Direct 4 0
316015 US 19 0 Seo 2 0
316031 US 18 1 Ads 8 0
316033 US 17 1 Seo 4 0
316038 UK 18 0 Seo 2 0
316043 China 17 0 Ads 7 0
316087 US 19 1 Ads 19 1
316107 China 18 1 Seo 3 0
316128 US 17 0 Direct 13 1
316129 UK 17 0 Seo 12 0
316134 US 18 0 Seo 6 0
316151 US 17 1 Seo 4 0
316160 Germany 17 1 Ads 5 0
316163 UK 18 0 Seo 2 0
316189 China 17 0 Ads 6 0

25412 rows × 6 columns

Exploratory data analysis

In [6]:
from bokeh.charts import Histogram, output_notebook, show
from bokeh.resources import Resources

resource = Resources(mode='inline')
output_notebook(resources=resource)
p = Histogram(df['age'], bins=30, title="Age Distribution (30 bins)")
show(p)
BokehJS successfully loaded.
Out[6]:
<bokeh.io._CommsHandle at 0x11e3ddd68>
In [7]:
# Remove "strange" records
df = df[df.age<80]
df.head()
Out[7]:
country age new_user source total_pages_visited converted
0 UK 25 1 Ads 1 0
1 US 23 1 Seo 5 0
2 US 28 1 Seo 4 0
3 China 39 1 Seo 5 0
4 US 30 1 Seo 6 0
In [9]:
output_notebook(resources=resource)
p = Bar(df, 'country', values='converted', agg='mean', title="Conversion Rate by Country")
show(p)
BokehJS successfully loaded.
Out[9]:
<bokeh.io._CommsHandle at 0x12978b2b0>
In [9]:
import numpy as np
grouped = df.loc[:, ['country', 'converted']].groupby('country')

date_country = grouped.mean()
date_country.index
#grouped.groups
#grouped.sum()
# data_pages = grouped.aggregate(np.mean)
# data_pages
Out[9]:
Index(['China', 'Germany', 'UK', 'US'], dtype='object', name='country')
In [10]:
%matplotlib inline
from matplotlib import pyplot as plt
import numpy as np

ind = np.arange(len(df.country.unique()))
width = 0.5
plt.xkcd()
# fig = plt.figure()
fig, ax = plt.subplots()
ax.bar(ind, date_country.converted, width, color="black")

ax.set_title("Conversion Rate by Country")
ax.set_xticks(ind + width / 2)
ax.set_xticklabels(date_country.index)
Out[10]:
[<matplotlib.text.Text at 0x120737b00>,
 <matplotlib.text.Text at 0x120747c50>,
 <matplotlib.text.Text at 0x12099dac8>,
 <matplotlib.text.Text at 0x1209a9518>]
In [10]:
output_notebook(resources=resource)
p = Bar(df, 'source', values='converted', agg='mean', color="wheat", title="Conversion Rate by Source")
show(p)
BokehJS successfully loaded.
Out[10]:
<bokeh.io._CommsHandle at 0x11fe93358>
In [11]:
output_notebook(resources=resource)
p = Bar(df, 'new_user', values='converted', agg='mean', color="green", title="Conversion Rate by New User")
show(p)
BokehJS successfully loaded.
Out[11]:
<bokeh.io._CommsHandle at 0x132eb0da0>
In [12]:
grouped = df.loc[:, ['age', 'converted']].groupby('age')
data_age = grouped.mean()
data_age["Age"] = data_age.index
# data_age

from bokeh.charts import Line, output_notebook, show
output_notebook(resources=resource)
p = Line(data_age, x='Age', y='converted', color="blue", title="Conversion Rate by Age",
       plot_width=900, plot_height=400)
show(p)
BokehJS successfully loaded.
Out[12]:
<bokeh.io._CommsHandle at 0x12611ea58>
In [14]:
grouped = df.loc[:, ['total_pages_visited', 'converted']].groupby('total_pages_visited')

#grouped.groups
#grouped.sum()
data_pages = grouped.aggregate(np.mean)
data_pages
Out[14]:
converted
total_pages_visited
1 0.000000
2 0.000228
3 0.000251
4 0.000780
5 0.001570
6 0.003437
7 0.006769
8 0.015238
9 0.033067
10 0.061068
11 0.123471
12 0.244537
13 0.400825
14 0.587521
15 0.740181
16 0.871818
17 0.923077
18 0.961219
19 0.989381
20 0.997531
21 1.000000
22 1.000000
23 1.000000
24 1.000000
25 1.000000
26 1.000000
27 1.000000
28 1.000000
29 1.000000
In [15]:
output_notebook(resources=resource)
p = Line(data_pages, title="Conversion Rate vs Total Pages Visited", legend="top_left", ylabel="Conversion Rate")
show(p)
BokehJS successfully loaded.
Out[15]:
<bokeh.io._CommsHandle at 0x117b49160>

Machine Learning

In [16]:
# from sklearn import preprocessing
# le_country = preprocessing.LabelEncoder()
# le_country.fit(df['country'])
# print (list(le_country.classes_))
# df['country_encoded'] = le_country.transform(df['country'])

# le_source = preprocessing.LabelEncoder()
# le_source.fit(df['source'])
# print (list(le_source.classes_))
# df['source_encoded'] = le_source.transform(df['source'])

country_encoded, country_index = pd.factorize(df['country'])
df['country_encoded'] = country_encoded

source_encoded, source_index = pd.factorize(df['source'])
df['source_encoded'] = source_encoded

df.head()
Out[16]:
country age new_user source total_pages_visited converted country_encoded source_encoded
0 UK 25 1 Ads 1 0 0 0
1 US 23 1 Seo 5 0 1 1
2 US 28 1 Seo 4 0 1 1
3 China 39 1 Seo 5 0 2 1
4 US 30 1 Seo 6 0 1 1
In [17]:
from sklearn.cross_validation import train_test_split
# split 80/20 train-test
x_train, x_test, y_train, y_test = train_test_split(df.loc[:, ['country_encoded', 'age', 'new_user', 'source_encoded', 'total_pages_visited']],
                                                    df.converted,
                                                    test_size=0.2,
                                                    random_state=1)
x_train.columns
Out[17]:
Index(['country_encoded', 'age', 'new_user', 'source_encoded',
       'total_pages_visited'],
      dtype='object')
In [18]:
from sklearn.ensemble import RandomForestClassifier

clf = RandomForestClassifier(n_estimators=100, oob_score=True)
clf.fit(x_train, y_train)
Out[18]:
RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=1,
            oob_score=True, random_state=None, verbose=0, warm_start=False)
In [19]:
clf.oob_score_
Out[19]:
0.98453498209188872
In [20]:
clf.n_features_
Out[20]:
5
In [21]:
importance = pd.DataFrame({"feature": pd.Categorical(x_train.columns), "importance": clf.feature_importances_})
output_notebook(resources=resource)
p = Bar(importance, label="feature", values="importance", color="orange", title="Feature importance")
show(p)
BokehJS successfully loaded.
Out[21]:
<bokeh.io._CommsHandle at 0x12b62d198>
In [22]:
preds = clf.predict(x_test)
In [23]:
pd.crosstab(y_test, preds, rownames=['actual'], colnames=['preds'])
Out[23]:
preds 0 1
actual
0 60887 323
1 646 1384
In [24]:
from sklearn.metrics import accuracy_score, confusion_matrix, roc_curve, auc
In [25]:
print ("Accuracy:", accuracy_score(y_test, preds) )
print ("Confusion Matrix:\n", confusion_matrix(y_test, preds) )
Accuracy: 0.984677419355
Confusion Matrix:
 [[60887   323]
 [  646  1384]]
In [26]:
fpr, tpr, thresholds = roc_curve(y_test, preds)
In [27]:
fpr
Out[27]:
array([ 0.        ,  0.00527692,  1.        ])
In [28]:
tpr
Out[28]:
array([ 0.       ,  0.6817734,  1.       ])
In [29]:
thresholds
Out[29]:
array([2, 1, 0])
In [30]:
%matplotlib inline
import matplotlib.pyplot as plt

plt.figure(1)
plt.plot([0, 1], [0, 1], 'k--')
plt.plot(fpr, tpr, label='Random Forests')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic')
plt.legend(loc="lower right")
plt.show()
In [31]:
auc(fpr, tpr)
Out[31]:
0.83824824173905066
In [32]:
from bokeh.plotting import figure, show, output_notebook

output_notebook(resources=resource)

p = figure(title="Receiver Operating Characteristic",
           y_range=(0.0, 1.05))

p.line(fpr, tpr, legend="Random Forests")

show(p)
BokehJS successfully loaded.
Out[32]:
<bokeh.io._CommsHandle at 0x130244d68>
In [34]:
%matplotlib inline
import matplotlib.pyplot as plt

from collections import OrderedDict
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.cross_validation import train_test_split

plt.figure(1, figsize=(12, 8))

RANDOM_STATE = 123
NTREES = 100

# split 80/20 train-test
x_train, x_test, y_train, y_test = train_test_split(df.loc[:, ['country_encoded', 
                                                               'age', 
                                                               'new_user', 
                                                               'source_encoded', 
                                                               'total_pages_visited']],
                                                    df.converted,
                                                    test_size=0.2,
                                                    random_state=RANDOM_STATE)

ensemble_clfs = [
    ("RandomForestClassifier, max_features='sqrt'",
        RandomForestClassifier(warm_start=True, n_estimators=NTREES, oob_score=True,
                               max_features="sqrt",
                               random_state=RANDOM_STATE))
]

# Map a classifier name to a list of (<n_estimators>, <error rate>) pairs.
error_rate = OrderedDict((label, []) for label, _ in ensemble_clfs)

# Range of `n_estimators` values to explore.
min_estimators = 15
max_estimators = 200

for label, clf in ensemble_clfs:
    for i in range(min_estimators, max_estimators + 1):
        clf.set_params(n_estimators=i)
        clf.fit(x_train, y_train)

        # Record the OOB error for each `n_estimators=i` setting.
        oob_error = 1 - clf.oob_score_
        error_rate[label].append((i, oob_error))

# Generate the "OOB error rate" vs. "n_estimators" plot.
for label, clf_err in error_rate.items():
    xs, ys = zip(*clf_err)
    plt.plot(xs, ys, label=label)

plt.xlim(min_estimators, max_estimators)
plt.xlabel("n_estimators")
plt.ylabel("OOB error rate")
plt.legend(loc="upper right")
# plt.show()
Out[34]:
<matplotlib.legend.Legend at 0x12b981048>

Plot OOB Error for multiple types of random forests

In [35]:
%matplotlib inline
import matplotlib.pyplot as plt

from collections import OrderedDict
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.cross_validation import train_test_split

plt.figure(1, figsize=(12, 8))

RANDOM_STATE = 123
NTREES = 100

# split 80/20 train-test
x_train, x_test, y_train, y_test = train_test_split(df.loc[:, ['country_encoded', 
                                                               'age', 
                                                               'new_user', 
                                                               'source_encoded', 
                                                               'total_pages_visited']],
                                                    df.converted,
                                                    test_size=0.2,
                                                    random_state=RANDOM_STATE)

ensemble_clfs = [
    ("RandomForestClassifier, max_features='sqrt'",
        RandomForestClassifier(warm_start=True, n_estimators=NTREES, oob_score=True,
                               max_features="sqrt",
                               random_state=RANDOM_STATE)),
    ("RandomForestClassifier, max_features='log2'",
        RandomForestClassifier(warm_start=True, n_estimators=NTREES, max_features='log2',
                               oob_score=True,
                               random_state=RANDOM_STATE)),
    ("RandomForestClassifier, max_features=None",
        RandomForestClassifier(warm_start=True, n_estimators=NTREES, max_features=None,
                               oob_score=True,
                               random_state=RANDOM_STATE))
]

# Map a classifier name to a list of (<n_estimators>, <error rate>) pairs.
error_rate = OrderedDict((label, []) for label, _ in ensemble_clfs)

# Range of `n_estimators` values to explore.
min_estimators = 15
max_estimators = 200

for label, clf in ensemble_clfs:
    for i in range(min_estimators, max_estimators + 1):
        clf.set_params(n_estimators=i)
        clf.fit(x_train, y_train)

        # Record the OOB error for each `n_estimators=i` setting.
        oob_error = 1 - clf.oob_score_
        error_rate[label].append((i, oob_error))

# Generate the "OOB error rate" vs. "n_estimators" plot.
for label, clf_err in error_rate.items():
    xs, ys = zip(*clf_err)
    plt.plot(xs, ys, label=label)

plt.xlim(min_estimators, max_estimators)
plt.xlabel("n_estimators")
plt.ylabel("OOB error rate")
plt.legend(loc="upper right")
# plt.show()
Out[35]:
<matplotlib.legend.Legend at 0x12b675160>

Removing total_page_viewed

In [36]:
# from sklearn.ensemble import RandomForestClassifier

# features = df.columns[[6,1,2,7]]
# list(features)

from sklearn.cross_validation import train_test_split
# split 80/20 train-test
x_train, x_test, y_train, y_test = train_test_split(df.loc[:, ['country_encoded', 'age', 'new_user', 'source_encoded']],
                                                    df.converted,
                                                    test_size=0.2,
                                                    random_state=1)
x_train.columns
Out[36]:
Index(['country_encoded', 'age', 'new_user', 'source_encoded'], dtype='object')
In [37]:
from sklearn.ensemble import RandomForestClassifier

clf = RandomForestClassifier()
clf.fit(x_train, y_train)
Out[37]:
RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)
In [38]:
importance = pd.DataFrame({"feature": pd.Categorical(x_train.columns), "importance": clf.feature_importances_})

from bokeh.charts import Bar, output_notebook, show
output_notebook(resources=resource)
p = Bar(importance, label="feature", values="importance", color="gray", title="Feature importance")
show(p)
BokehJS successfully loaded.
Out[38]:
<bokeh.io._CommsHandle at 0x122f87b00>
In [39]:
preds = clf.predict(x_test)
In [40]:
from sklearn.metrics import accuracy_score, confusion_matrix, roc_curve, auc
In [41]:
print ("Accuracy:", accuracy_score(y_test, preds) )
print ("Confusion Matrix:\n", confusion_matrix(y_test, preds) )
Accuracy: 0.967900063251
Confusion Matrix:
 [[61210     0]
 [ 2030     0]]
In [42]:
from sklearn.cross_validation import cross_val_score
scores = cross_val_score(clf, x_test, y_test)
scores.mean() 
Out[42]:
0.96742567915026678
In [ ]: